import pandas as pd
import numpy as np
import random
from functions import *



#######################################################
#Preprocessing
#######################################################

# Seed both random number generators so repeated runs are reproducible.
random.seed(50)
np.random.seed(50)

# Load the PPP data; only the 'City' and 'State' columns are used below.
PPP_df = pd.read_csv('150k_plus_cities.csv')

# Drop rows with a missing city and rows whose state is the placeholder 'XX'.
PPP_df = PPP_df[PPP_df['City'].notna()]
PPP_df = PPP_df[PPP_df['State'] != 'XX']

# Reference table of correctly spelled cities. Names are upper-cased so they
# can be compared with the PPP city names (presumably already upper case --
# TODO confirm). The vectorized .str accessor leaves missing names as NaN
# instead of raising the way a per-element name.upper() call would.
spelling_df = pd.read_csv('uscities.csv')
spelling_df['city'] = spelling_df['city'].str.upper()

# State abbreviations that occur in the PPP data.
states = list(PPP_df['State'].unique())

# Per-state frames of candidate string pairs. They are gathered in a list and
# concatenated once after the loop, because concatenating inside the loop
# re-copies every previously accumulated row on each iteration.
state_frames = []

for state in states:
    print(state, ':')
    # Cities in this state according to the PPP data.
    PPP_this_state = set(PPP_df.loc[PPP_df['State'] == state, 'City'])
    # Cities in this state according to the reference table.
    this_state_real = set(
        spelling_df.loc[spelling_df['state_id'] == state, 'city']
    )
    # PPP names without an exact match in the reference table are treated as
    # candidate misspellings.
    mispelled = PPP_this_state - this_state_real
    num_combos = len(mispelled) * len(this_state_real)
    print('computing', num_combos, 'pairs:')
    print('')
    if num_combos > 0:
        # String-distance metrics for every (misspelled, reference) pair in
        # this state. precompute_distances is defined outside this file; it
        # is assumed to return a DataFrame -- TODO confirm.
        all_combos = precompute_distances(
            list(mispelled),
            list(this_state_real),
            dump_duplicates=True,
            combination_type='dif_lists',
        )
        state_frames.append(all_combos)

# city_df holds every considered string pair. pd.concat raises on an empty
# list, so fall back to an empty frame when no state produced any pairs.
city_df = pd.concat(state_frames) if state_frames else pd.DataFrame({})
city_df = city_df.drop_duplicates()

# Export is disabled; a later section reloads city_df from
# 'cities_all_pairs.csv'. The file name here uses the same lower-case
# spelling so the two agree on case-sensitive file systems.
#city_df.to_csv('cities_all_pairs.csv', index=False)


#######################################################
#Reading in Saved Data and Models
#######################################################

# base_training is the ~70,000 labeled string pairs from the amicus-bonica
# train set.
base_training = pd.read_csv('full_train_set.csv')

# Candidate city pairs computed earlier and cached on disk. This replaces any
# city_df built in memory above.
city_df = pd.read_csv('cities_all_pairs.csv')

# basic_model is the model trained on base_training only. The file is opened
# in a with-block so the handle is closed as soon as unpickling finishes.
# NOTE(review): `pkl` is not imported in this file; presumably it is
# re-exported by `from functions import *` -- confirm.
# Unpickling executes code stored in the file, so only load trusted models.
with open("iter0_model.p", "rb") as model_file:
    basic_model = pkl.load(model_file)

# Number of top-ranked string pairs used in the human-in-the-loop step.
num_matches = 500

#######################################################
#Human-In-The-Loop Step
#######################################################

# Score every city pair with basic_model (scores go under 'basic_score') and
# renumber the rows 0..n-1. get_predictions is defined outside this file; the
# rows are presumably returned best-first -- TODO confirm.
ranked_matches = get_predictions(basic_model, city_df, 'basic_score')
ranked_matches = ranked_matches.reset_index(drop=True)

# The first num_matches rows of the ranking are the pairs shown to the human
# labeler in the HITL step.
best_matches = ranked_matches.head(num_matches)

print('Human in the Loop Step:')
print('Reading in pre-labeled pairs from this iteration...')

# Interactive labeling is disabled; previously saved labels are read back
# from disk later in the script.
#match_pairs = np.array(best_matches[['amicus', 'bonica']])
#ground_truth = ask_about_matches(match_pairs)

#######################################################
#Training New Model on HITL Loop Data
#######################################################

# Disabled: attach the human labels to the top pairs and cache them on disk.
# The file name matches the one read back below.
#best_matches['label'] = ground_truth
#best_matches.to_csv('cities_hitl_pairs.csv', index=False)

# Load the running feature-importance table and drop any rows already
# recorded for this task, so re-running the script does not duplicate them.
feature_importances = pd.read_csv('feature_importances_by_iteration.csv')
feature_importances = feature_importances.loc[
    feature_importances['Task'] != 'City Name Correction'
]

# Labeled HITL pairs saved by a previous interactive run.
best_matches = pd.read_csv('cities_hitl_pairs.csv')

# Label counts, recorded alongside the feature importances.
num_positives = int((best_matches['label'] == 1).sum())
num_negatives = int((best_matches['label'] == 0).sum())

# New training set: the base training pairs plus the HITL-labeled pairs.
new_train = pd.concat([base_training, best_matches])

# The string-distance features the model is trained on. Defined once so the
# training matrix and the importance row below cannot drift apart.
feature_cols = ['cosine', 'jaccard', 'levenshtein', 'lcsstr', 'overlap']
X_train = new_train[feature_cols]
y_train = new_train['label']

# Re-seed immediately before training so the fit is reproducible.
random.seed(30)
np.random.seed(30)
# train is defined outside this file; feat_imports is assumed to hold one
# importance per feature, in the order of feature_cols -- TODO confirm.
model, feat_imports = train(X_train, y_train)

new_row = {'Iteration Number': 1, 'Task': 'City Name Correction'}
new_row.update(
    {col: feat_imports[pos] for pos, col in enumerate(feature_cols)}
)
new_row['num_matches'] = num_positives
new_row['num_negatives'] = num_negatives

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported way to add the row.
feature_importances = pd.concat(
    [feature_importances, pd.DataFrame([new_row])],
    ignore_index=True,
)
feature_importances.to_csv('feature_importances_by_iteration.csv', index=False)

#######################################################
#Saving Files
#######################################################

# Save the retrained model, then read it back from disk. Each handle lives in
# its own with-block so the write is flushed and closed before the file is
# reopened for reading.
# NOTE(review): `pkl` is not imported in this file; presumably it is
# re-exported by `from functions import *` -- confirm.
with open("cities_model.p", "wb") as model_file:
    pkl.dump(model, model_file)
with open("cities_model.p", "rb") as model_file:
    model = pkl.load(model_file)

# Every pair that was not part of the HITL batch (the first num_matches rows
# of the ranking) is re-scored with the new model. Positional slicing selects
# the same rows as the earlier label-based slice, because ranked_matches was
# re-indexed 0..n-1 after ranking, and does not depend on the index labels.
test_set = ranked_matches.iloc[num_matches:]
ranked_matches = get_predictions(model, test_set, 'hitl_score')
ranked_matches.to_csv('Cities_Final_Results.csv', index=False)